We don't need your loop

by serge_sans_paille

Quarkslab / Télécom Bretagne / Namek

PyConFR 2015

Une brève histoire de la boucle

Assembleur

.L3:
    addsd   (%rsi), %xmm0
    addq    $8, %rsi
    cmpq    %rax, %rsi
    jne .L3

FORTRAN

do 10 i = 1, n
         sum = sum + A(i)
  10  continue

C

for(unsigned i = 0; i < n ; ++i)
  sum += A[i];

APL

+/ A

Python


In [2]:
import numpy
n = 10
A = numpy.random.random(n)
print(A)


[ 0.46102614  0.56142032  0.60601388  0.38648678  0.2613231   0.82951922
  0.55104271  0.72755349  0.84486497  0.69278971]

Fortran style


In [2]:
s = 0
for i in range(n):
    s += A[i]
print(s)


3.83779068785

APL style


In [3]:
s = numpy.sum(A)
print(s)


3.83779068785

Une préférence ?

Question de performances

Boucles explicites


In [4]:
n = 1000000
A = numpy.random.random(n)

def explicit_sum(seq):
    s = 0
    for elem in seq:
        s += elem ** 2
    return s

%timeit explicit_sum(A)


1 loops, best of 3: 252 ms per loop

In [6]:
%timeit numpy.sum(A**2)


100 loops, best of 3: 1.84 ms per loop

In [ ]:
### Cython

In [41]:
%load_ext Cython

In [8]:
%%cython
cimport numpy as np
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
def cython_sum(np.ndarray[double, ndim=1] A):
    cdef double s = 0
    cdef int i, n
    n = len(A)
    for i in range(n):
        s += A[i] * A[i]
    return s

In [9]:
%timeit cython_sum(A)


1000 loops, best of 3: 883 µs per loop

In [11]:
%%file pythran_sum.py

#pythran export pythran_sum(float64[])

import numpy
def pythran_sum(A):
    return numpy.sum(A**2)


Overwriting pythran_sum.py

In [12]:
!python -m pythran.run pythran_sum.py -DUSE_BOOST_SIMD -O3 -march=native


WARNING  Your pythranrc has an obsolete `user' section

In [14]:
!python -m timeit -s 'from pythran_sum import pythran_sum; import numpy; n = 1000000 ; A = numpy.random.random(n)' 'pythran_sum(A)'


1000 loops, best of 3: 401 usec per loop

We don't need your loops!

  • Un appel de fonction a plus de sens qu'une boucle pour un humain
  • Un appel de fonction a plus de sens qu'une boucle pour un compilateur

Exemples de boucles amenées à disparaitre


In [16]:
n, m = 100, 200
A = numpy.random.random((n,m))

In [19]:
s = 0.
for i in range(1, n-1):
    for j in range(1, m-1):
        s += A[i,j]
print(s)


9714.74017815

In [21]:
print(numpy.sum(A[1:-1, 1:-1]))


9714.74017815

In [22]:
s = 0
for i in range(n):
    for j in range(m):
        if A[i,j] < .5:
            s += 1
print(s)


10043

In [23]:
print(numpy.sum(A < .5))


10043

In [25]:
s = 0
B = numpy.empty_like(A)
for i in range(n):
    for j in range(m):
        if A[i,j] < .5:
            B[i,j] = A[i,j]
        else:
            B[i,j] = 0.
print(B)


[[ 0.          0.24422838  0.38341328 ...,  0.43256797  0.          0.        ]
 [ 0.06229533  0.12058999  0.35258651 ...,  0.          0.          0.26504433]
 [ 0.          0.37721507  0.         ...,  0.          0.          0.45327171]
 ..., 
 [ 0.10379296  0.14472365  0.26055037 ...,  0.25811615  0.46467391
   0.05963935]
 [ 0.          0.          0.         ...,  0.          0.39666637
   0.34696423]
 [ 0.          0.46156258  0.         ...,  0.40101053  0.          0.        ]]

In [26]:
print(numpy.where(A<.5,A,0.))


[[ 0.          0.24422838  0.38341328 ...,  0.43256797  0.          0.        ]
 [ 0.06229533  0.12058999  0.35258651 ...,  0.          0.          0.26504433]
 [ 0.          0.37721507  0.         ...,  0.          0.          0.45327171]
 ..., 
 [ 0.10379296  0.14472365  0.26055037 ...,  0.25811615  0.46467391
   0.05963935]
 [ 0.          0.          0.         ...,  0.          0.39666637
   0.34696423]
 [ 0.          0.46156258  0.         ...,  0.40101053  0.          0.        ]]

In [27]:
### Piège!

In [33]:
n = 100
B = numpy.arange(100)
shift = 3
for i in range(shift, n):
        B[i] = 1 + B[i - shift]
print(B)


[ 0  1  2  1  2  3  2  3  4  3  4  5  4  5  6  5  6  7  6  7  8  7  8  9  8
  9 10  9 10 11 10 11 12 11 12 13 12 13 14 13 14 15 14 15 16 15 16 17 16 17
 18 17 18 19 18 19 20 19 20 21 20 21 22 21 22 23 22 23 24 23 24 25 24 25 26
 25 26 27 26 27 28 27 28 29 28 29 30 29 30 31 30 31 32 31 32 33 32 33 34 33]

In [35]:
B = numpy.arange(100)
B[shift:] = 1 + B[:-shift]
print(B)


[ 0  1  2  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97]

Les scientifiquent codent en Numpy de haut niveau !


In [45]:
%%file grayscott.py
#pythran export GrayScott(int, float, float, float, float)

import numpy as np
def GrayScott(counts, Du, Dv, F, k):
    n = 300
    U = np.zeros((n+2,n+2), dtype=np.float32)
    V = np.zeros((n+2,n+2), dtype=np.float32)
    u, v = U[1:-1,1:-1], V[1:-1,1:-1]

    r = 20
    u[:] = 1.0
    U[n/2-r:n/2+r,n/2-r:n/2+r] = 0.50
    V[n/2-r:n/2+r,n/2-r:n/2+r] = 0.25

    u += 0.15*np.random.random((n,n))
    v += 0.15*np.random.random((n,n))

    for i in range(counts):
        Lu = (                 U[0:-2,1:-1] +
              U[1:-1,0:-2] - 4*U[1:-1,1:-1] + U[1:-1,2:] +
                               U[2:  ,1:-1] )
        Lv = (                 V[0:-2,1:-1] +
              V[1:-1,0:-2] - 4*V[1:-1,1:-1] + V[1:-1,2:] +
                               V[2:  ,1:-1] )
        uvv = u*v*v
        u += Du*Lu - uvv + F*(1 - u)
        v += Dv*Lv + uvv - (F + k)*v

    return V


Overwriting grayscott.py

In [38]:
from grayscott import GrayScott

In [39]:
%timeit GrayScott(40, 0.16, 0.08, 0.04, 0.06)


10 loops, best of 3: 49.4 ms per loop

In [42]:
%%cython
cimport cython
import numpy as np
cimport numpy as np

cpdef CythonGrayScott(int counts, double Du, double Dv, double F, double k):
    cdef int n = 300
    cdef np.ndarray U = np.zeros((n+2,n+2), dtype=np.float_)
    cdef np.ndarray V = np.zeros((n+2,n+2), dtype=np.float_)
    cdef np.ndarray u = U[1:-1,1:-1]
    cdef np.ndarray v = V[1:-1,1:-1]

    cdef int r = 20
    u[:] = 1.0
    U[n/2-r:n/2+r,n/2-r:n/2+r] = 0.50
    V[n/2-r:n/2+r,n/2-r:n/2+r] = 0.25
    u += 0.15*np.random.random((n,n))
    v += 0.15*np.random.random((n,n))

    cdef np.ndarray Lu = np.zeros_like(u)
    cdef np.ndarray Lv = np.zeros_like(v)
    cdef int i, c, r1, c1, r2, c2
    cdef double uvv

    cdef double[:, ::1] bU = U
    cdef double[:, ::1] bV = V
    cdef double[:, ::1] bLu = Lu
    cdef double[:, ::1] bLv = Lv

    for i in range(counts):
        for r in range(n):
            r1 = r + 1
            r2 = r + 2
            for c in range(n):
                c1 = c + 1
                c2 = c + 2
                bLu[r,c] = bU[r1,c2] + bU[r1,c] + bU[r2,c1] + bU[r,c1] - 4*bU[r1,c1]
                bLv[r,c] = bV[r1,c2] + bV[r1,c] + bV[r2,c1] + bV[r,c1] - 4*bV[r1,c1]

        for r in range(n):
            r1 = r + 1
            for c in range(n):
                c1 = c + 1
                uvv = bU[r1,c1]*bV[r1,c1]*bV[r1,c1]
                bU[r1,c1] += Du*bLu[r,c] - uvv + F*(1 - bU[r1,c1])
                bV[r1,c1] += Dv*bLv[r,c] + uvv - (F + k)*bV[r1,c1]
    return V

In [61]:
%timeit GrayScott(40, 0.16, 0.08, 0.04, 0.06)
%timeit CythonGrayScott(40, 0.16, 0.08, 0.04, 0.06)


10 loops, best of 3: 50.3 ms per loop
10 loops, best of 3: 37.9 ms per loop

In [62]:
!python -m pythran.run -O3 -march=native grayscott.py -o pythran_grayscott.so


WARNING  Your pythranrc has an obsolete `user' section

In [63]:
! python -m timeit -s 'from pythran_grayscott import GrayScott' 'GrayScott(40, 0.16, 0.08, 0.04, 0.06)'


10 loops, best of 3: 33.1 msec per loop

Optimisations faites par Pythran

  • Fusion de boucle (expression template + forward subsitution)
  • Propagation de constantes interprocédurales
  • Élimination de code mort
  • Évaluation paresseuse
  • substitution de motifs
  • Déroulage de boucles étendues
  • Vectorisation (génération de code SIMD)

Fonctionnalités supportées par Pythran

  • Python2.7
  • Exception
  • List/set/dict comprehension
  • Générateurs, generator expression
  • Récursion
  • lambda, fonctions imbriquées, fermetures
  • type destructuring

Mais pas

  • Python3 (plus compliqué qu'il n'y parait!)
  • Classes utilisateurs
  • Code implicitement non statiquement typé
  • Gestionnaires with

Paquets, modules et fonctions supportés

  • __builtin__, math, cmath
  • bisect, functools (juste partial en fait), itertools
  • operator, random, string, time
  • numpy (dont numpy.random mais pas numpy.linalg)

Séduits ? Pas convaincus ? Essayez le !


In [ ]: